Functional programming

# Read the city crime data set (rows = cities, columns = crime types).
data_crime = read.table("citycrime.txt", header = TRUE)

sapply(), apply(), vapply() and lapply()

##     Murder       Rape    Robbery    Assault   Burglary    Larceny 
##   20.88056   70.04583  637.56250  831.63056 1715.89167 4706.79722 
##        MVT 
## 1377.92500
##     Murder       Rape    Robbery    Assault   Burglary    Larceny 
##   20.88056   70.04583  637.56250  831.63056 1715.89167 4706.79722 
##        MVT 
## 1377.92500
##     Murder       Rape    Robbery    Assault   Burglary    Larceny 
##   20.88056   70.04583  637.56250  831.63056 1715.89167 4706.79722 
##        MVT 
## 1377.92500
##     Murder       Rape    Robbery    Assault   Burglary    Larceny 
##   20.88056   70.04583  637.56250  831.63056 1715.89167 4706.79722 
##        MVT 
## 1377.92500
## $Murder
## [1] 20.88056
## 
## $Rape
## [1] 70.04583
## 
## $Robbery
## [1] 637.5625
## 
## $Assault
## [1] 831.6306
## 
## $Burglary
## [1] 1715.892
## 
## $Larceny
## [1] 4706.797
## 
## $MVT
## [1] 1377.925

Descriptive statistics

summary_stats = function(x) {
  # Descriptive statistics for a numeric vector x: mean, standard
  # deviation, maximum, minimum and median, returned as a named vector.
  c(mean = mean(x),
    sd   = sd(x),
    max  = max(x),
    min  = min(x),
    med  = median(x))
}

# Apply summary_stats to every column: lapply returns a named list with
# one statistics vector per crime type.
crime_stats = lapply(data_crime, summary_stats)

Cities with highest crime rates

# City names are stored as the data frame's row names.
cities = rownames(data_crime)

max_city = function(x) {
  # Return the name of the city with the HIGHEST value of x.
  # Fix: the original used which.min(), which returned the lowest-crime
  # city despite the function's name and the "highest crime rates" heading.
  # Reads `cities` from the enclosing environment.
  cities[which.max(x)]
}
# Apply max_city to each crime column.
# NOTE(review): the output below shows the *lowest*-crime cities because
# max_city was written with which.min() — likely a bug given the heading.
sapply(data_crime, max_city)
##           Murder             Rape          Robbery          Assault 
##       "Honolulu"      "Santa-Ana"       "Honolulu" "Virginia-Beach" 
##         Burglary          Larceny              MVT 
##       "San-Jose"       "San-Jose" "Virginia-Beach"

Parallel

# Sketch of a parallel lapply, left commented out.
#library(parallel)
#wait = function() {
#  Sys.sleep(3)
# }
# no_cores <- detectCores() - 1
# cl <- makeCluster(no_cores)
# 
# NOTE(review): parLapply must be passed the function object itself, i.e.
# parLapply(cl, 1:no_cores, wait) — writing wait() would sleep once locally
# and hand parLapply the call's NULL result instead of the function.
# parLapply(cl, 1:no_cores, wait())
# 
# stopCluster(cl)

Map function - weighted average

# One random cost/weight vector per crime column, kept as a list.
# Fix: use ncol(data_crime) instead of the hard-coded 7 so the pairing
# with Map() below stays correct if columns are added or removed.
cost_per_city = replicate(ncol(data_crime), runif(length(cities)), simplify = FALSE) 
# Map pairs each crime column with its weight vector element-wise and
# computes the weighted mean of each.
Map(weighted.mean, data_crime, cost_per_city)
## $Murder
## [1] 20.3482
## 
## $Rape
## [1] 68.09087
## 
## $Robbery
## [1] 610.0875
## 
## $Assault
## [1] 805.6186
## 
## $Burglary
## [1] 1682.386
## 
## $Larceny
## [1] 4779.669
## 
## $MVT
## [1] 1411.044

Reduce and filter

# Top-10 city names per crime type.
# Fix: decreasing = TRUE instead of the fragile abbreviation T (T/F are
# ordinary variables and can be reassigned).
top10_cities = lapply(data_crime, function(x) cities[order(x, decreasing = TRUE)[1:10]])

# Manual nested intersection — verbose and does not scale.
intersect(intersect(intersect(top10_cities$Murder, top10_cities$Rape), top10_cities$Robbery), top10_cities$Assault) #... and so on
## [1] "Atlanta"
# Reduce folds intersect() over the whole list in one call.
Reduce(intersect, top10_cities)
## [1] "Atlanta"
# Keep only the crime types whose top-10 list contains Detroit.
Filter(function(x) "Detroit" %in% x, top10_cities)
## $Murder
##  [1] "New-Orleans" "Washingt"    "St-Louis"    "Detroit"     "Burmingham" 
##  [6] "Atlanta"     "Baltimore"   "Oakland"     "Newark"      "Chicago"    
## 
## $Rape
##  [1] "Minneapolis" "Cleveland"   "Oklahoma"    "Kansas-City" "Memphis"    
##  [6] "Detroit"     "Toledo"      "Columbus"    "Cicinnati"   "Atlanta"    
## 
## $Robbery
##  [1] "Newark"    "St-Louis"  "Miami"     "Baltimore" "Atlanta"  
##  [6] "Detroit"   "Chicago"   "Tampa"     "Washingt"  "Oakland"  
## 
## $MVT
##  [1] "Tampa"      "Fresno"     "Newark"     "Detroit"    "Miami"     
##  [6] "St-Louis"   "Sacramento" "Atlanta"    "Portland"   "Boston"

Position() and Find()

Visualization using ggplot

# Base-graphics scatterplot of robbery vs assault rates.
plot(x = data_crime$Robbery, y = data_crime$Assault)

Quick plot

library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.2.5
# NOTE(review): qplot() is deprecated in ggplot2 >= 3.4.0; prefer
# ggplot(data_crime, aes(Robbery, Assault)) + geom_point() going forward.
qplot(x = Robbery, y = Assault, data = data_crime)

ggplot standard functions

# Base plot object: robbery on x, assault on y; layers are added per plot.
crime_plot <- ggplot(data_crime, aes(x = Robbery, y = Assault))

# Plain scatterplot.
crime_plot + geom_point()

# Map murder rate to colour and larceny rate to point size.
crime_plot + geom_point(aes(color = Murder, size = Larceny))

# Blue-to-red gradient for murder rate, on a white background theme.
crime_plot +
  geom_point(aes(color = Murder)) +
  scale_colour_gradient(high = "red", low = "blue") +
  theme_bw()

# Label points with city names, dropping labels that would overlap.
crime_plot + geom_text(aes(label = rownames(data_crime)), check_overlap = TRUE)

3d plot

#install.packages("plotly")
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
# State the trace type and mode explicitly instead of letting plotly
# guess — the console messages below (from the original implicit call)
# show exactly what it had to infer.
p <- plot_ly(data_crime, x = ~Robbery, y = ~Assault, z = ~Murder,
             type = "scatter3d", mode = "markers")
p
## No trace type specified:
##   Based on info supplied, a 'scatter3d' trace seems appropriate.
##   Read more about this trace type -> https://plot.ly/r/reference/#scatter3d
## No scatter3d mode specifed:
##   Setting the mode to markers
##   Read more about this attribute -> https://plot.ly/r/reference/#scatter-mode

pairs

library(GGally)
## Warning: package 'GGally' was built under R version 3.2.5
## Warning: replacing previous import by 'utils::capture.output' when loading
## 'GGally'
## Warning: replacing previous import by 'utils::head' when loading 'GGally'
## Warning: replacing previous import by 'utils::installed.packages' when
## loading 'GGally'
## Warning: replacing previous import by 'utils::str' when loading 'GGally'
# Pairs plot: points in the upper triangle, correlations in the lower,
# density estimates on the diagonal.
crime_pairs = ggpairs(data_crime,axisLabels = "none",
        upper = list(continuous = "points", combo = "dot"),
        lower = list(continuous = "cor", combo = "dot"),
        diag = list(continuous = "densityDiag")) + 
  theme_bw()
crime_pairs

pdf("crime_ggpairs.pdf")
# Fix: explicit print() — a bare `crime_pairs` auto-prints only at an
# interactive top level (or in knitr), so nothing would be drawn to the
# pdf device when this file is source()'d.
print(crime_pairs)
dev.off()
## quartz_off_screen 
##                 2
## quartz_off_screen 
##                 2

Exercises

Work in pairs to solve the next questions. In all problems, try to use functional programming when possible, and ggplot.

  1. Simulate data from a multivariate normal distribution with mean 0 and covariance diag(\(3^2, 2^2,1,1,\ldots\)) using p=20 variables and n=100 observations. Compute the eigenvalues of the sample covariance and plot them.

  2. Create a list containing different versions of the simulated data, varying the number of observations in each one from 100 to 2000. For each element in the list, compute the SVD and measure the running time. Store the results in an array. Plot the running times as a function of the number of observations. (You can use proc.time().)

  3. Repeat the previous problem, but using the eigendecomposition of the covariance matrix.

  4. Repeat 2 and 3, but instead of varying the number of observations, change the number of variables from 100 to 2000.

# Exercise 1: simulate multivariate normal data with covariance
# diag(3^2, 2^2, 1, 1, ...) and plot the eigenvalues of the sample
# covariance matrix.
# NOTE(review): removed an unused `sample = matrix(rnorm(1000*10), ...)`
# line that also shadowed base::sample().
p = 20
n = 100
# Scale column j by sd_j = (3, 2, 1, 1, ...), giving variances (9, 4, 1, ...).
X = sweep(matrix(rnorm(n*p),ncol = p), 2, c(3,2,rep(1,p-2)), "*")
eigenvalues = eigen(cov(X))$values
qplot(y=eigenvalues, xlab="Index", ylab="Eigenvalues")

generate_data = function(n, p, sds = c(3, 2, rep(1, p - 2))) {
  # Simulate n observations of p independent normal variables with the
  # given per-column standard deviations.
  #
  # n   : number of observations (rows).
  # p   : number of variables (columns); p >= 2 is needed for the default sds.
  # sds : length-p vector of column standard deviations. The default
  #       (3, 2, 1, 1, ...) matches covariance diag(9, 4, 1, ...), and is
  #       backward compatible with the original hard-coded scaling.
  #
  # Returns an n x p numeric matrix. (The original assigned the result to
  # a local and relied on the assignment's invisible return value.)
  stopifnot(length(sds) == p)
  sweep(matrix(rnorm(n * p), ncol = p), 2, sds, "*")
}

compute_svd = function(X) {
  # Time the SVD route to the sample-covariance eigenvalues: center X,
  # then squared singular values / (n - 1).
  #
  # X : numeric matrix (n observations x p variables).
  # Returns elapsed wall-clock seconds as an unnamed numeric scalar.
  # Uses system.time() instead of hand-rolled proc.time() bookkeeping.
  elapsed <- system.time({
    Xc <- scale(X, center = TRUE, scale = FALSE)
    svd(Xc)$d^2 / (nrow(Xc) - 1)
  })["elapsed"]
  unname(elapsed)
}

# Datasets with n = 100, 200, ..., 5000 observations and p = 100 variables.
# NOTE(review): exercise 2 asks for n from 100 to 2000; this runs to 5000.
data_obs = lapply(seq(100, 5000, 100), generate_data, p = 100)
# Elapsed SVD time per dataset.
svd_times = sapply(data_obs, compute_svd) 
qplot(y = svd_times, main = "SVD", xlab = "Observations", ylab = "Running time")

compute_eig = function(X) {
  # Time the eigendecomposition route: eigenvalues of the sample
  # covariance matrix of X (for comparison with compute_svd).
  #
  # X : numeric matrix (n observations x p variables).
  # Returns elapsed wall-clock seconds as an unnamed numeric scalar.
  # Uses system.time() instead of hand-rolled proc.time() bookkeeping.
  elapsed <- system.time({
    eigen(cov(X))$values
  })["elapsed"]
  unname(elapsed)
}
# Elapsed eigendecomposition time per dataset, plotted against n.
eig_times = sapply(data_obs, compute_eig) 
qplot(y = eig_times, main = "Eig", xlab = "Observations", ylab = "Running time")

# Exercise 4: vary the number of variables (p = 100, ..., 2000) at a
# fixed n = 100 observations, and time both decomposition routes.
data_var <- lapply(seq(100, 2000, 100), generate_data, n = 100)

# Timing via the SVD route.
svd_times <- sapply(data_var, compute_svd)
qplot(y = svd_times, main = "SVD", xlab = "Variables", ylab = "Running time")

# Timing via the eigendecomposition route.
eig_times <- sapply(data_var, compute_eig)
qplot(y = eig_times, main = "Eig", xlab = "Variables", ylab = "Running time")